# Loading required packages
import bambi as bmb
import scipy as sp
import numpy as np
import arviz as az
import polars as pl
import plotnine as p9
import matplotlib.pyplot as plt

Simulating data
az.style.use("arviz-darkgrid")
np.random.seed(1996)

One method for validating a proposed model is to generate a dataset with known parameters and then fit the model to it. If we observe difficulties fitting the model, we need to reconsider the parameterisation or the priors.
Experiment
Given a set of strains, we want to characterise the amount of protein each cell line produces. We have access to sets of reactors to perform this experiment, and the protein concentrations will be measured using mass spectrometry, which has a log-normal measurement model.
Notes
- Often we want to make statements about strains rather than reactors.
- If we assume that the reactor-to-reactor variation is the same between strains, then we are able to improve our fits using a hierarchical modelling approach called “partial pooling”.
# Generating the dataset
# True mean growth (protein level) for each strain, on the natural scale.
strain_growth_array = [0.4, 0.425, 0.5]

# Strain index -> log of its true level: the scale the model works on.
strain_growth_map = {idx: np.log(level) for idx, level in enumerate(strain_growth_array)}

# Simulation constants: reactor-to-reactor sd, assay sd, and design size.
sigma_reactor_effect = 0.05
sigma_quant = 0.03
n_reactors = 2
n_replicates = 4

# Draw each reactor's "true" protein value: a log-normal perturbation of the
# strain mean with sd sigma_reactor_effect (one draw per reactor, per strain).
protein_measurements = {}
for idx in range(len(strain_growth_array)):
    log_mean = np.log(strain_growth_array[idx])
    protein_measurements[idx] = [
        float(np.exp(log_mean + np.random.normal(0, 1) * sigma_reactor_effect))
        for _ in range(n_reactors)
    ]
# Simulate n_replicates noisy measurements per reactor, on the log scale.
# Each reactor gets a globally unique id across strains so "reactor" can be
# used directly as a grouping factor in the hierarchical model.
raw_data = []
reactor_id = 0
for strain, reactor_truths in protein_measurements.items():
    for truth in reactor_truths:
        log_truth = np.log(truth)
        for _ in range(n_replicates):
            raw_data.append(
                {
                    "strain": strain,
                    "reactor": reactor_id,
                    # reactor truth plus log-normal assay noise
                    "prot": log_truth + np.random.normal(0, 1) * sigma_quant,
                }
            )
        reactor_id += 1
data = pl.from_dicts(raw_data)
data

| strain | reactor | prot |
|---|---|---|
| i64 | i64 | f64 |
| 0 | 0 | -0.875719 |
| 0 | 0 | -0.879013 |
| 0 | 0 | -0.923178 |
| 0 | 0 | -0.923928 |
| 0 | 1 | -0.941512 |
| … | … | … |
| 2 | 4 | -0.625698 |
| 2 | 5 | -0.61643 |
| 2 | 5 | -0.677768 |
| 2 | 5 | -0.672501 |
| 2 | 5 | -0.63258 |
strain_growth_map

{0: np.float64(-0.916290731874155),
1: np.float64(-0.8556661100577202),
2: np.float64(-0.6931471805599453)}
Confidence Interval
# Two-sided (1 - alpha) t confidence interval for each strain's mean
# log-protein, compared against the known simulated value.
alpha = 0.05

summary_stats = data.group_by("strain").agg(
    mean=pl.mean("prot"),
    # Standard error of the mean is s / sqrt(n), not s / n.
    SEM=pl.std("prot") / pl.col("prot").count().sqrt(),
    count=pl.col("prot").count(),
)

# t critical value with n - 1 degrees of freedom, evaluated per group.
t_crit = (pl.col("count") - 1).map_elements(
    lambda dof: sp.stats.t.ppf(1 - alpha / 2, dof), return_dtype=pl.Float64
)

summary_stats = summary_stats.with_columns(
    (pl.col("mean") + t_crit * pl.col("SEM")).alias("upp_ci"),
    (pl.col("mean") - t_crit * pl.col("SEM")).alias("low_ci"),
    # True (simulated) log-protein level, to eyeball CI coverage.
    pl.col("strain").replace_strict(strain_growth_map).alias("true_protein"),
)
summary_stats

| strain | mean | SEM | count | upp_ci | low_ci | true_protein |
|---|---|---|---|---|---|---|
| i64 | f64 | f64 | u32 | f64 | f64 | f64 |
| 0 | -0.928906 | 0.004705 | 8 | -0.917779 | -0.940033 | -0.916291 |
| 2 | -0.644822 | 0.003288 | 8 | -0.637047 | -0.652597 | -0.693147 |
| 1 | -0.877932 | 0.007293 | 8 | -0.860688 | -0.895176 | -0.855666 |
sp.stats.ttest_ind(data.filter(pl.col("strain")==0)["prot"], data.filter(pl.col("strain")==1)["prot"], equal_var=True)

TtestResult(statistic=np.float64(-3.5517718345604115), pvalue=np.float64(0.003189683995315531), df=np.float64(14.0))
Statistical model
The model we are trying to fit in this instance is as follows \[\mu_{reactor} \sim LogNormal(\mu_{true, reactor}, \sigma_{quant})\] \[\mu_{true, reactor} \sim LogNormal(\mu_{true, strain}, \sigma_{biological})\]
With priors \[\sigma_{quant} \sim \mathrm{LogNormal}(\log(0.03),\ 0.2)\] \[\sigma_{biological} \sim \mathrm{HalfNormal}(0.05)\]
# Hierarchical model: fixed strain means, a random intercept per reactor,
# and measurement noise on the (log-scale) response.
priors1 = {}
# Weakly-informative prior on the strain-level log means.
priors1["strain"] = bmb.Prior("Normal", mu=-0.7, sigma=1)
# Measurement error, centred on the known assay sd.
priors1["sigma"] = bmb.Prior("LogNormal", mu=np.log(sigma_quant), sigma=0.2)
# Spread of the reactor-level random intercepts.
priors1["1|reactor_sigma"] = bmb.Prior("HalfNormal", sigma=sigma_reactor_effect)

model1 = bmb.Model(
    "prot ~ 0 + strain + (1|reactor)",
    data=data.to_pandas(),
    categorical=["strain", "reactor"],
    priors=priors1,
    noncentered=False,
)
results1 = model1.fit(tune=2000, nuts_sampler_kwargs={"target_accept": 0.99}, idata_kwargs={"log_likelihood": True})

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, strain, 1|reactor_sigma, 1|reactor]
Sampling 4 chains for 2_000 tune and 1_000 draw iterations (8_000 + 4_000 draws total) took 3 seconds.
strain_growth_map

{0: np.float64(-0.916290731874155),
1: np.float64(-0.8556661100577202),
2: np.float64(-0.6931471805599453)}
az.plot_forest(results1, var_names=["strain"], combined=True)

array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
# Pooled (non-hierarchical) model: no reactor term, so the residual sd must
# absorb BOTH assay noise and reactor-to-reactor variation. Independent noise
# sources combine in quadrature (variances add), so the prior is centred on
# sqrt(sigma_quant^2 + sigma_reactor_effect^2). The previous harmonic sum
# 1/((1/sigma_quant)+(1/sigma_reactor_effect)) is smaller than either
# component and is not a valid way to pool independent noise scales.
pooled_sigma = np.sqrt(sigma_quant**2 + sigma_reactor_effect**2)

priors2 = {
    # Weakly-informative prior on the strain-level log means.
    "strain": bmb.Prior("Normal", mu=-0.7, sigma=1),
    # Total residual noise (assay + biological), combined in quadrature.
    "sigma": bmb.Prior("LogNormal", mu=np.log(pooled_sigma), sigma=0.1),
}

model2 = bmb.Model(
    "prot ~ 0 + strain",
    data=data.to_pandas(),
    categorical=["strain", "reactor"],
    priors=priors2,
    noncentered=False,
)
results2 = model2.fit(tune=2000, nuts_sampler_kwargs={"target_accept": 0.99}, idata_kwargs={"log_likelihood": True})

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, strain]
Sampling 4 chains for 2_000 tune and 1_000 draw iterations (8_000 + 4_000 draws total) took 1 seconds.
np.log(strain_growth_array)

array([-0.91629073, -0.85566611, -0.69314718])
az.plot_forest(results2, var_names=["strain"], combined=True)

array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
# LOO-CV comparison of the hierarchical model against the pooled one.
models_dict = dict(model1=results1, model2=results2)
df_compare = az.compare(models_dict)
df_compare

/home/nicholas/projects/bayesian_statistics_for_computational_biology/.venv/lib/python3.12/site-packages/arviz/stats/stats.py:830: UserWarning: Estimated shape parameter of Pareto distribution is greater than 0.70 for one or more samples. You should consider using a more robust model, this is because importance sampling is less likely to work well if the marginal posterior and LOO posterior are very different. This is more likely to happen with a non-robust model and highly influential observations.
| rank | elpd_loo | p_loo | elpd_diff | weight | se | dse | warning | scale | |
|---|---|---|---|---|---|---|---|---|---|
| model1 | 0 | 43.990324 | 6.317852 | 0.000000 | 1.000000e+00 | 4.077870 | 0.000000 | False | log |
| model2 | 1 | 25.383249 | 10.043843 | 18.607074 | 1.953993e-14 | 12.545958 | 10.079173 | True | log |